Setup the Environment

CSS Styling
writeLines("td, th { padding : 3px } th { background-color:white; color:black; border:1px solid black; text-align:center } td {color:black; border:1px solid black; word-wrap:break-word; white-space:nowrap; overflow: hidden; text-overflow: ellipsis; max-width:300px; text-align:left}", con= "../css/style.css")

Comparison of Samples and Metabolites Between Replicate Pairs

Boxplots (Original Abundances)

# Plot Boxplots faceted by shared metabolites
################################################################################
p <- plot_df %>%
  ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
  geom_boxplot(alpha = 0.8) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
             x = "Abundance", y = "") 
plot(p)

Zero/Missing Values in Metabolites (Original Abundances)

# Identify Zero/Missing Values in Metabolites
################################################################################
na_df <- plot_df %>%
  ungroup() %>% 
  group_by(METABOLITE_NAME, REPLICATE) %>%
  mutate(ZERO_LOG = ifelse(VALUE == 0, 1, 0)) %>%
  mutate(ZERO_N = sum(ZERO_LOG)) %>%
  mutate(ZERO_FREQ = round(ZERO_N/length(shared_sams), digits = 2)) %>%
  mutate(NA_LOG = ifelse(is.na(VALUE), 1, 0)) %>%
  mutate(NA_N = sum(NA_LOG)) %>%
  mutate(NA_FREQ = round(NA_N/length(shared_sams), digits = 2)) %>%
  select(REPLICATE, METABOLITE_NAME, ZERO_N, ZERO_FREQ, NA_N, NA_FREQ) %>%
  unique() %>%
  arrange(REPLICATE, desc(ZERO_FREQ), desc(NA_FREQ)) %>%
  ungroup()

na_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "400px")
REPLICATE METABOLITE_NAME ZERO_N ZERO_FREQ NA_N NA_FREQ
811 Homocysteine 71 0.91 0 0
811 1-Methylhistidine 45 0.58 0 0
811 Carnosine 45 0.58 0 0
811 3-Methylhistidine 19 0.24 0 0
811 Arginine 4 0.05 0 0
811 Cysteine 1 0.01 0 0
811 Alanine 0 0.00 0 0
811 Anserine 0 0.00 0 0
811 Asparagine 0 0.00 0 0
811 Aspartic acid 0 0.00 0 0
811 Citrulline 0 0.00 0 0
811 Cystathionine 0 0.00 0 0
811 Ethanolamine 0 0.00 0 0
811 Glutamic acid 0 0.00 0 0
811 Glutamine 0 0.00 0 0
811 Glycine 0 0.00 0 0
811 Histidine 0 0.00 0 0
811 Hydroxyproline 0 0.00 0 0
811 Isoleucine 0 0.00 0 0
811 Leucine 0 0.00 0 0
811 Lysine 0 0.00 0 0
811 Methionine 0 0.00 0 0
811 Ornithine 0 0.00 0 0
811 Phenylalanine 0 0.00 0 0
811 Phosphoethanolamine 0 0.00 0 0
811 Proline 0 0.00 0 0
811 Sarcosine 0 0.00 0 0
811 Serine 0 0.00 0 0
811 Taurine 0 0.00 0 0
811 Threonine 0 0.00 0 0
811 Tryptophan 0 0.00 0 0
811 Tyrosine 0 0.00 0 0
811 Valine 0 0.00 0 0
811 2-Aminobutyric acid 0 0.00 0 0
811 Aminoadipic acid 0 0.00 0 0
811 beta-Alanine 0 0.00 0 0
811 Aminoisobutyric acid 0 0.00 0 0
811 gamma-Aminobutyric acid 0 0.00 0 0
812 Homocysteine 69 0.88 0 0
812 1-Methylhistidine 49 0.63 0 0
812 Carnosine 39 0.50 0 0
812 3-Methylhistidine 28 0.36 0 0
812 Arginine 2 0.03 0 0
812 Cysteine 1 0.01 0 0
812 Alanine 0 0.00 0 0
812 Anserine 0 0.00 0 0
812 Asparagine 0 0.00 0 0
812 Aspartic acid 0 0.00 0 0
812 Citrulline 0 0.00 0 0
812 Cystathionine 0 0.00 0 0
812 Ethanolamine 0 0.00 0 0
812 Glutamic acid 0 0.00 0 0
812 Glutamine 0 0.00 0 0
812 Glycine 0 0.00 0 0
812 Histidine 0 0.00 0 0
812 Hydroxyproline 0 0.00 0 0
812 Isoleucine 0 0.00 0 0
812 Leucine 0 0.00 0 0
812 Lysine 0 0.00 0 0
812 Methionine 0 0.00 0 0
812 Ornithine 0 0.00 0 0
812 Phenylalanine 0 0.00 0 0
812 Phosphoethanolamine 0 0.00 0 0
812 Proline 0 0.00 0 0
812 Sarcosine 0 0.00 0 0
812 Serine 0 0.00 0 0
812 Taurine 0 0.00 0 0
812 Threonine 0 0.00 0 0
812 Tryptophan 0 0.00 0 0
812 Tyrosine 0 0.00 0 0
812 Valine 0 0.00 0 0
812 2-Aminobutyric acid 0 0.00 0 0
812 Aminoadipic acid 0 0.00 0 0
812 beta-Alanine 0 0.00 0 0
812 Aminoisobutyric acid 0 0.00 0 0
812 gamma-Aminobutyric acid 0 0.00 0 0
# Remove Metabolites with high NAor Zero Frequencies
################################################################################
met_rm <- na_df %>%
  filter((ZERO_FREQ >= 0.95) | (NA_FREQ >= 0.8)) %>%
  select(METABOLITE_NAME) %>% unlist() %>% unique()
metabolites <- metabolites[metabolites %!in% met_rm]
plot_df <- plot_df %>%
  filter(METABOLITE_NAME %in% metabolites)
plot_df$METABOLITE_NAME <- as.character(plot_df$METABOLITE_NAME)

Distributions (Original Abundances)

# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in c(reps)){
  # Collect numeric vector
  num_vec <- plot_df %>%
    filter(REPLICATE == rep) %>%
    select(VALUE) %>% unlist() %>% unname()
  df <- NumericSummaryStats(num_vec)
  row.names(df) <- rep
  df_all <- rbind(df_all, df)
}
df_all %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
NA_COUNT NA_FREQ MEAN MEDIAN MAX MIN MID_RANGE VARIANCE STD_DEV SE_MEAN Q1 Q3 KURTOSIS SKEW BC_LAMBDA
811 0 0 4.02 0.86 73.84 0 73.84 80.12 8.95 0.16 0.1 3.19 16.92 3.86 None
812 0 0 3.99 0.86 75.94 0 75.94 75.37 8.68 0.16 0.1 3.18 15.50 3.69 None
# Plot the density plot for all the gene counts
################################################################################
plot_df %>%
  ggplot(aes(x = VALUE, color = REPLICATE)) +
  geom_density() +
  xlim(min(df_all$MIN), mean(df_all$Q3)) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
       x = "Abundance",
       y = "Density")

Metabolite by Metabolite Correlation (Original Abundances)

################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
  unite(METABOLITE_NAME,REPLICATE, 
        col = METABOLITE_NAME_REPLICATE, remove = F) %>%
  select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
  map(arrange, labelid) %>%
  map(tibble::column_to_rownames, var = "labelid") %>%
  map(as.matrix)

# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
  shared_met_mat <-  do.call(cbind,x) 
}

Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}


# Reorder the correlation matrix
cormat <- cor1

# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)

################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat 
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

Scatterplots (Original Abundances)

# Subset the data for sites to compare (scatterplots)
###################################################
plot_join_df <- countdata_df %>%
  ungroup() %>%
  unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
  filter(TISSUE == tissue) %>%
  filter(NAMED == named) %>%
  filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
  unnest(COUNT_DATA) %>%
  filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
  filter(METABOLITE_NAME %in% metabolites) %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                              grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                              )) %>%
  select(REPLICATE,METABOLITE_NAME,labelid,VALUE) %>%
  mutate(REPLICATE = as.character(REPLICATE)) %>%
  split(.$REPLICATE) %>%
  map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
  map(~select(., -REPLICATE)) %>%
  purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))

# Plot scatter plots
################################################################################
# Plot scatter plots ordered by sample (include R2 values)
lm_df <- plot_join_df %>%
  rename(name = METABOLITE_NAME) %>%
  rename(x = reps[1]) %>%
  rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
  met_df <- lm_df %>%
    filter(name == metab)
  r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
  arrange(METABOLITE) %>%
  select(METABOLITE,R2) %>%
  arrange(desc(R2))
met_r2_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
METABOLITE R2
2-Aminobutyric acid 0.913
Aminoadipic acid 0.894
Serine 0.728
Glutamine 0.706
Threonine 0.699
Aminoisobutyric acid 0.665
Alanine 0.654
Sarcosine 0.644
Lysine 0.638
Glutamic acid 0.605
Aspartic acid 0.597
Taurine 0.579
Glycine 0.568
Hydroxyproline 0.551
Cystathionine 0.489
3-Methylhistidine 0.487
Ornithine 0.459
Cysteine 0.305
Proline 0.298
Valine 0.277
Isoleucine 0.254
Tyrosine 0.246
Citrulline 0.23
beta-Alanine 0.207
Histidine 0.196
Tryptophan 0.173
gamma-Aminobutyric acid 0.146
Carnosine 0.121
Ethanolamine 0.121
Leucine 0.0946
Arginine 0.0774
Anserine 0.0758
Asparagine 0.0669
Methionine 0.0628
Phenylalanine 0.0324
Homocysteine 0.00266
1-Methylhistidine 0.000219
Phosphoethanolamine 0.000142
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()

# Plot the scatter plots
for(metab in scat_met_order){
  p <- plot_join_df %>%
    filter(METABOLITE_NAME == metab) %>%
  ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 1) +
  geom_abline(linetype="dashed") +
  facet_wrap(~ METABOLITE_NAME) +
  expand_limits(x = 0, y = 0) +
  #labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
  coord_fixed(ratio=1)
  plot(p)
}

Sample by Sample Correlations (Original Abundances)

################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################

# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
  unite(labelid,REPLICATE, 
        col = labelid_REPLICATE, remove = F) %>%
  select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
  map(as.matrix)
# Subset matrices
shared_sam_mat <-  do.call(rbind,x) %>% 
  t()

# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}

# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)

################################################################################
####################### Unclustered ############################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat

# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

PCA Analysis (Original Abundances)

pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
  tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
  sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')

p0 <- PC %>%
  ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
  geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
  ggtitle('Shared Samples and Shared Metabolites') + 
  xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
  ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
  theme(aspect.ratio=1)
p0

p1 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        scale_color_manual(values=ec_colors) +
        theme(aspect.ratio=1)
p1

PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        theme(aspect.ratio=1)
p2

Normalization (AutoScale)

# Normalize
# Note in this step, we use labelid to account for mayo's duplicate samples
################################################################################
norm_df <- plot_df %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  split(.$REPLICATE) %>%
  map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(column_to_rownames, var = "labelid_viallabel") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix) %>%
  map(as.data.frame) %>%
  map(rownames_to_column, var = "labelid_viallabel") %>%
  map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
  map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                                    grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  bind_rows()

Boxplots (Normalized Abundances)

# Plot Boxplots faceted by shared metabolites
################################################################################
norm_df %>%
  ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
  geom_boxplot(alpha = 0.8) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
             x = "Abundance", y = "") 

Metabolite by Metabolite Correlation (Normalized Abundances)

################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
  unite(METABOLITE_NAME,REPLICATE, 
        col = METABOLITE_NAME_REPLICATE, remove = F) %>%
  select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
  map(arrange, labelid) %>%
  map(tibble::column_to_rownames, var = "labelid") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix)

# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
  shared_met_mat <-  do.call(cbind,x) 
}

Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}

# Reorder the correlation matrix
cormat <- cor1

# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)

################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat 
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

Scatterplots (Normalized Abundances)

# Normalize
# Subset the data for sites to compare (scatterplots)
###################################################
norm_join_df <- countdata_df %>%
  ungroup() %>%
  unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
  filter(TISSUE == tissue) %>%
  filter(NAMED == named) %>%
  filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
  unnest(COUNT_DATA) %>%
  filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                               grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  split(.$REPLICATE) %>%
  map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(column_to_rownames, var = "labelid_viallabel") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix) %>%
  map(as.data.frame) %>%
  map(rownames_to_column, var = "labelid_viallabel") %>%
  map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
  map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                                    grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
  map(~select(., -REPLICATE)) %>%
  map(mutate, labelid = gsub(pattern = "_..*", replacement = '', labelid_viallabel)) %>%
  map(select, -labelid_viallabel) %>%
  purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))

# Plot scatter plots ordered by sample (include R2 values)
lm_df <- norm_join_df %>%
  rename(name = METABOLITE_NAME) %>%
  rename(x = reps[1]) %>%
  rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
  met_df <- lm_df %>%
    filter(name == metab)
  r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
  arrange(METABOLITE) %>%
  select(METABOLITE,R2) %>%
  arrange(desc(R2))
met_r2_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
METABOLITE R2
2-Aminobutyric acid 0.913
Aminoadipic acid 0.894
Serine 0.728
Glutamine 0.706
Threonine 0.699
Aminoisobutyric acid 0.665
Alanine 0.654
Sarcosine 0.644
Lysine 0.638
Glutamic acid 0.605
Aspartic acid 0.597
Taurine 0.579
Glycine 0.568
Hydroxyproline 0.551
Cystathionine 0.489
3-Methylhistidine 0.487
Ornithine 0.459
Cysteine 0.305
Proline 0.298
Valine 0.277
Isoleucine 0.254
Tyrosine 0.246
Citrulline 0.23
beta-Alanine 0.207
Histidine 0.196
Tryptophan 0.173
gamma-Aminobutyric acid 0.146
Carnosine 0.121
Ethanolamine 0.121
Leucine 0.0946
Arginine 0.0774
Anserine 0.0758
Asparagine 0.0669
Methionine 0.0628
Phenylalanine 0.0324
Homocysteine 0.00266
1-Methylhistidine 0.000219
Phosphoethanolamine 0.000142
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()

# Plot scatter plots (Normalized)
################################################################################
# Plot the scatter plots
for(metab in scat_met_order){
  p <- norm_join_df %>%
    filter(METABOLITE_NAME == metab) %>%
  ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 1) +
  geom_abline(linetype="dashed") +
  facet_wrap(~ METABOLITE_NAME) +
  expand_limits(x = 0, y = 0) +
  #labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
  coord_fixed(ratio=1)
  plot(p)
}

# norm_join_df %>%
#   mutate(METABOLITE_NAME = factor(METABOLITE_NAME, levels = scat_met_order)) %>%
#   ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
#   geom_point(alpha = 0.5) +
#   geom_smooth(method = "lm", size = 1) +
#   geom_abline(linetype="dashed") +
#   facet_wrap(~ METABOLITE_NAME) +
#   expand_limits(x = 0, y = 0) +
#   labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
#   coord_fixed(ratio=1)

Distributions (Normalized Abundances)

# Plot the density plot for all the gene counts
################################################################################
norm_df %>%
  ggplot(aes(x = VALUE, color = REPLICATE)) +
  geom_density() +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
       x = "Abundance",
       y = "Density")

# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in reps){
  # Collect numeric vector
  num_vec <- norm_df %>%
    filter(REPLICATE == rep) %>%
    select(VALUE) %>% unlist() %>% unname()
  df <- NumericSummaryStats(num_vec)
  row.names(df) <- rep
  df_all <- rbind(df_all, df)
}
df_all
#}

Sample by Sample Correlations (Normalized Abundances)

################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################

# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
  unite(labelid,REPLICATE, 
        col = labelid_REPLICATE, remove = F) %>%
  select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix)
# Subset matrices
shared_sam_mat <-  do.call(rbind,x) %>% 
  t()

# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}

# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)

################################################################################
####################### Unclustered ############################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat

# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

PCA Analysis (Normalized Abundances)

# Plot a PCA with outliers labeled
# shared_sam_mat <- shared_sam_mat %>%
#   t() %>% AutoScaleMatrix() %>%
#   t()
pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
  tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
  sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')

p0 <- PC %>%
  ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
  geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
  ggtitle('Shared Samples and Shared Metabolites') + 
  xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
  ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
  theme(aspect.ratio=1)
p0

p1 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        scale_color_manual(values=ec_colors) +
        theme(aspect.ratio=1)
p1

PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        theme(aspect.ratio=1)
p2